-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[SDAG] Avoid creating redundant stack slots when lowering FSINCOS #108401
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-aarch64 Author: Benjamin Maxwell (MacDue) ChangesWhen lowering Note: As a NFC this also adds Full diff: https://github.com/llvm/llvm-project/pull/108401.diff 4 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
index 7a131645893921..045ec7d3653119 100644
--- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
+++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h
@@ -62,6 +62,10 @@ Libcall getLDEXP(EVT RetVT);
/// UNKNOWN_LIBCALL if there is none.
Libcall getFREXP(EVT RetVT);
+/// getFSINCOS - Return the FSINCOS_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+Libcall getFSINCOS(EVT RetVT);
+
/// Return the SYNC_FETCH_AND_* value for the given opcode and type, or
/// UNKNOWN_LIBCALL if there is none.
Libcall getSYNC(unsigned Opc, MVT VT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index f5fbc01cd95e96..7b0dc63c473d25 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2326,15 +2326,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
/// Return true if sincos libcall is available.
static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
- RTLIB::Libcall LC;
- switch (Node->getSimpleValueType(0).SimpleTy) {
- default: llvm_unreachable("Unexpected request for libcall!");
- case MVT::f32: LC = RTLIB::SINCOS_F32; break;
- case MVT::f64: LC = RTLIB::SINCOS_F64; break;
- case MVT::f80: LC = RTLIB::SINCOS_F80; break;
- case MVT::f128: LC = RTLIB::SINCOS_F128; break;
- case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
- }
+ RTLIB::Libcall LC = RTLIB::getFSINCOS(Node->getSimpleValueType(0).SimpleTy);
return TLI.getLibcallName(LC) != nullptr;
}
@@ -2355,68 +2347,73 @@ static bool useSinCos(SDNode *Node) {
}
/// Issue libcalls to sincos to compute sin / cos pairs.
-void
-SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
- SmallVectorImpl<SDValue> &Results) {
- RTLIB::Libcall LC;
- switch (Node->getSimpleValueType(0).SimpleTy) {
- default: llvm_unreachable("Unexpected request for libcall!");
- case MVT::f32: LC = RTLIB::SINCOS_F32; break;
- case MVT::f64: LC = RTLIB::SINCOS_F64; break;
- case MVT::f80: LC = RTLIB::SINCOS_F80; break;
- case MVT::f128: LC = RTLIB::SINCOS_F128; break;
- case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
- }
-
- // The input chain to this libcall is the entry node of the function.
- // Legalizing the call will automatically add the previous call to the
- // dependence.
- SDValue InChain = DAG.getEntryNode();
-
+void SelectionDAGLegalize::ExpandSinCosLibCall(
+ SDNode *Node, SmallVectorImpl<SDValue> &Results) {
EVT RetVT = Node->getValueType(0);
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
TargetLowering::ArgListTy Args;
- TargetLowering::ArgListEntry Entry;
+ TargetLowering::ArgListEntry Entry{};
+
+ // Find users of the node that store the results. The destination pointers
+ // can be used instead of creating stack allocations.
+ StoreSDNode *SinST = nullptr;
+ StoreSDNode *CosST = nullptr;
+ for (SDNode::use_iterator UI = Node->use_begin(), UE = Node->use_end();
+ UI != UE; ++UI) {
+ SDUse &Use = UI.getUse();
+ SDNode *User = Use.getUser();
+ if (!ISD::isNormalStore(User))
+ continue;
+ auto *ST = cast<StoreSDNode>(User);
+ if (Use.getResNo() == 0)
+ SinST = ST;
+ if (Use.getResNo() == 1)
+ CosST = ST;
+ }
// Pass the argument.
Entry.Node = Node->getOperand(0);
Entry.Ty = RetTy;
- Entry.IsSExt = false;
- Entry.IsZExt = false;
Args.push_back(Entry);
+ auto GetOrCreateOutPointer = [&](StoreSDNode *MaybeStore) {
+ if (MaybeStore)
+ return std::make_pair(MaybeStore->getBasePtr(),
+ MaybeStore->getPointerInfo());
+ SDValue StackSlot = DAG.CreateStackTemporary(RetVT);
+ auto PtrInfo = MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(),
+ cast<FrameIndexSDNode>(StackSlot)->getIndex());
+ return std::make_pair(StackSlot, PtrInfo);
+ };
+
// Pass the return address of sin.
- SDValue SinPtr = DAG.CreateStackTemporary(RetVT);
- Entry.Node = SinPtr;
+ auto SinPtr = GetOrCreateOutPointer(SinST);
+ Entry.Node = SinPtr.first;
Entry.Ty = PointerType::getUnqual(RetTy->getContext());
- Entry.IsSExt = false;
- Entry.IsZExt = false;
Args.push_back(Entry);
// Also pass the return address of the cos.
- SDValue CosPtr = DAG.CreateStackTemporary(RetVT);
- Entry.Node = CosPtr;
+ auto CosPtr = GetOrCreateOutPointer(CosST);
+ Entry.Node = CosPtr.first;
Entry.Ty = PointerType::getUnqual(RetTy->getContext());
- Entry.IsSExt = false;
- Entry.IsZExt = false;
Args.push_back(Entry);
- SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
- TLI.getPointerTy(DAG.getDataLayout()));
-
- SDLoc dl(Node);
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(InChain).setLibCallee(
- TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
- std::move(Args));
+ RTLIB::Libcall LC = RTLIB::getFSINCOS(RetVT);
+ auto [Call, Chain] = ExpandLibCall(LC, Node, std::move(Args), false);
- std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+ // Replace explict stores with the library call.
+ for (StoreSDNode *ST : {SinST, CosST}) {
+ if (ST)
+ DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), Chain);
+ }
- Results.push_back(
- DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, MachinePointerInfo()));
- Results.push_back(
- DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo()));
+ SDLoc DL(Node);
+ for (auto [Ptr, PtrInfo] : {SinPtr, CosPtr}) {
+ SDValue LoadExp = DAG.getLoad(RetVT, DL, Chain, Ptr, PtrInfo);
+ Results.push_back(LoadExp);
+ }
}
SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const {
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index eb3190c7cd247a..8e5a0a0ca06082 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -398,6 +398,11 @@ RTLIB::Libcall RTLIB::getFREXP(EVT RetVT) {
FREXP_PPCF128);
}
+RTLIB::Libcall RTLIB::getFSINCOS(EVT RetVT) {
+ return getFPLibCall(RetVT, SINCOS_F32, SINCOS_F64, SINCOS_F80, SINCOS_F128,
+ SINCOS_PPCF128);
+}
+
RTLIB::Libcall RTLIB::getOutlineAtomicHelper(const Libcall (&LC)[5][4],
AtomicOrdering Order,
uint64_t MemSize) {
diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
new file mode 100644
index 00000000000000..afd054a83a5014
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+
+define { float, float } @sincos_f32_value_return(float %x) {
+; CHECK-LABEL: sincos_f32_value_return:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: add x0, sp, #12
+; CHECK-NEXT: add x1, sp, #8
+; CHECK-NEXT: bl sincosf
+; CHECK-NEXT: ldp s1, s0, [sp, #8]
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call float @llvm.sin.f32(float %x)
+ %cos = tail call float @llvm.cos.f32(float %x)
+ %ret_0 = insertvalue { float, float } poison, float %sin, 0
+ %ret_1 = insertvalue { float, float } %ret_0, float %cos, 1
+ ret { float, float } %ret_1
+}
+
+define void @sincos_f32_ptr_return(float %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: sincos_f32_ptr_return:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl sincosf
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call float @llvm.sin.f32(float %x)
+ %cos = tail call float @llvm.cos.f32(float %x)
+ store float %sin, ptr %out_sin, align 4
+ store float %cos, ptr %out_cos, align 4
+ ret void
+}
+
+define float @sincos_f32_mixed_return(float %x, ptr %out_sin) {
+; CHECK-LABEL: sincos_f32_mixed_return:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: add x1, sp, #12
+; CHECK-NEXT: bl sincosf
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call float @llvm.sin.f32(float %x)
+ %cos = tail call float @llvm.cos.f32(float %x)
+ store float %sin, ptr %out_sin, align 4
+ ret float %cos
+}
+
+define { double, double } @sincos_f64_value_return(double %x) {
+; CHECK-LABEL: sincos_f64_value_return:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: add x0, sp, #24
+; CHECK-NEXT: add x1, sp, #8
+; CHECK-NEXT: bl sincos
+; CHECK-NEXT: ldr d0, [sp, #24]
+; CHECK-NEXT: ldr d1, [sp, #8]
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call double @llvm.sin.f64(double %x)
+ %cos = tail call double @llvm.cos.f64(double %x)
+ %ret_0 = insertvalue { double, double } poison, double %sin, 0
+ %ret_1 = insertvalue { double, double } %ret_0, double %cos, 1
+ ret { double, double } %ret_1
+}
+
+define void @sincos_f64_ptr_return(double %x, ptr %out_sin, ptr %out_cos) {
+; CHECK-LABEL: sincos_f64_ptr_return:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl sincos
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call double @llvm.sin.f64(double %x)
+ %cos = tail call double @llvm.cos.f64(double %x)
+ store double %sin, ptr %out_sin, align 4
+ store double %cos, ptr %out_cos, align 4
+ ret void
+}
+
+define double @sincos_f64_mixed_return(double %x, ptr %out_sin) {
+; CHECK-LABEL: sincos_f64_mixed_return:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: add x1, sp, #8
+; CHECK-NEXT: bl sincos
+; CHECK-NEXT: ldr d0, [sp, #8]
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %sin = tail call double @llvm.sin.f64(double %x)
+ %cos = tail call double @llvm.cos.f64(double %x)
+ store double %sin, ptr %out_sin, align 4
+ ret double %cos
+}
|
Note: See the second commit for the diff between the precommit and the new changes (diff here). |
Do we have anything else that could use this? modf comes to mind, but I don't think we have RTLIB support for that. |
FFREXP has the same issue due to materializing a stack slot for the |
9855e54
to
f46d132
Compare
596fc79
to
74113f0
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
5e63a71
to
f01dbd1
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you can drop all of the chain logic and just use DAG.getEntryNode. FSINCOS is derived from no-memory/errno operations, and thus assumed to not care about the state of errno. You don't need all of this code trying to figure out the suitable chain
I don't think that's safe with the stores. Given this IR (which contains a sincos intrinsic that lowers to FSINCOS): define void @test(float %v, ptr %x, ptr %y) {
call void @foo(ptr %x, ptr %y)
%sret = call { float, float } @llvm.sincos.f32(float %v)
%ret0 = extractvalue { float, float } %sret, 0
%ret1 = extractvalue { float, float } %sret, 1
store float %ret0, ptr %x, align 4
store float %ret1, ptr %y, align 4
ret void
}
declare void @foo(ptr, ptr) The stores for the results of |
When lowering `FSINCOS` to a library call (that takes output pointers) we can avoid creating new stack allocations if the results of the `FSINCOS` are being stored. Instead, we can take the destination pointers from the stores and pass those to the library call.
f01dbd1
to
052fddd
Compare
052fddd
to
e15c4e3
Compare
We found an interesting issue after this change: #115323 |
Thanks, I'll take a look 👍 |
When lowering
FSINCOS
to a library call (that takes output pointers) we can avoid creating new stack allocations if the results of theFSINCOS
are being stored. Instead, we can take the destination pointers from the stores and pass those to the library call.Note: As a NFC this also adds (and uses)
RTLIB::getFSINCOS()
.